This document is the first in a series for a project looking at a simple method for pricing car insurance based on claims data. This rmarkdown document focuses on loading the initial data and performing some systematic data exploration and cleaning.

1 Load Data

### Set up the data
data(freMTPLfreq)
data(freMTPLsev)
data(freMTPL2freq)
data(freMTPL2sev)

### as.data.table() copies the input and converts it in one step,
### replacing the separate copy() + setDT() calls.
policy1_dt <- as.data.table(freMTPLfreq)
claims1_dt <- as.data.table(freMTPLsev)
policy2_dt <- as.data.table(freMTPL2freq)
claims2_dt <- as.data.table(freMTPL2sev)

### Rename the policy columns to our snake_case convention.
setnames(policy1_dt
        ,c('policy_id','claim_count','exposure','power','car_age'
          ,'driver_age','brand','fuel','region','density'))
print(policy1_dt)
##         policy_id claim_count    exposure power car_age driver_age
##      1:         1           0 0.090000000     g       0         46
##      2:         2           0 0.840000000     g       0         46
##      3:         3           0 0.520000000     f       2         38
##      4:         4           0 0.450000000     f       2         38
##      5:         5           0 0.150000000     g       0         41
##     ---                                                           
## 413165:    413165           0 0.002739726     j       0         29
## 413166:    413166           0 0.005479452     d       0         29
## 413167:    413167           0 0.005479452     k       0         49
## 413168:    413168           0 0.002739726     d       0         41
## 413169:    413169           0 0.002739726     g       6         29
##                                      brand    fuel             region density
##      1: Japanese (except Nissan) or Korean  Diesel          Aquitaine      76
##      2: Japanese (except Nissan) or Korean  Diesel          Aquitaine      76
##      3: Japanese (except Nissan) or Korean Regular Nord-Pas-de-Calais    3003
##      4: Japanese (except Nissan) or Korean Regular Nord-Pas-de-Calais    3003
##      5: Japanese (except Nissan) or Korean  Diesel   Pays-de-la-Loire      60
##     ---                                                                      
## 413165: Japanese (except Nissan) or Korean  Diesel      Ile-de-France    2471
## 413166: Japanese (except Nissan) or Korean Regular      Ile-de-France    5360
## 413167: Japanese (except Nissan) or Korean  Diesel      Ile-de-France    5360
## 413168: Japanese (except Nissan) or Korean Regular      Ile-de-France    9850
## 413169: Japanese (except Nissan) or Korean  Diesel          Aquitaine      65
### Rename the claims columns to our snake_case convention.
setnames(claims1_dt, c('policy_id','claim_amount'))
print(claims1_dt)
##        policy_id claim_amount
##     1:     63987         1172
##     2:    310037         1905
##     3:    314463         1150
##     4:    318713         1220
##     5:    309380        55077
##    ---                       
## 16177:    302759           61
## 16178:    299443         1831
## 16179:    303389         4183
## 16180:    304313          566
## 16181:    206241         2156

2 Initial Data Exploration

Having loaded in the data, we want to look at the basic data types of the columns, along with row and columns counts. We also look at a quick summary of the data.

### Inspect the column types and the row/column counts.
glimpse(policy1_dt)
## Observations: 413,169
## Variables: 10
## $ policy_id   <fct> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19...
## $ claim_count <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ exposure    <dbl> 0.09, 0.84, 0.52, 0.45, 0.15, 0.75, 0.81, 0.05, 0.76, 0.34, 0.10,...
## $ power       <fct> g, g, f, f, g, g, d, d, d, i, f, f, e, e, e, e, e, e, i, i, h, h,...
## $ car_age     <int> 0, 0, 2, 2, 0, 0, 1, 0, 9, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ driver_age  <int> 46, 46, 38, 38, 41, 41, 27, 27, 23, 44, 32, 32, 33, 33, 33, 54, 6...
## $ brand       <fct> Japanese (except Nissan) or Korean, Japanese (except Nissan) or K...
## $ fuel        <fct> Diesel, Diesel, Regular, Regular, Diesel, Diesel, Regular, Regula...
## $ region      <fct> Aquitaine, Aquitaine, Nord-Pas-de-Calais, Nord-Pas-de-Calais, Pay...
## $ density     <int> 76, 76, 3003, 3003, 60, 60, 695, 695, 7887, 27000, 23, 23, 1746, ...
### Quick distributional summary of every column.
summary(policy1_dt)
##    policy_id       claim_count         exposure            power          car_age       
##  1      :     1   Min.   :0.00000   Min.   :0.002732   f      :95718   Min.   :  0.000  
##  2      :     1   1st Qu.:0.00000   1st Qu.:0.200000   g      :91198   1st Qu.:  3.000  
##  3      :     1   Median :0.00000   Median :0.540000   e      :77022   Median :  7.000  
##  4      :     1   Mean   :0.03916   Mean   :0.561088   d      :68014   Mean   :  7.532  
##  5      :     1   3rd Qu.:0.00000   3rd Qu.:1.000000   h      :26698   3rd Qu.: 12.000  
##  6      :     1   Max.   :4.00000   Max.   :1.990000   j      :18038   Max.   :100.000  
##  (Other):413163                                        (Other):36481                    
##    driver_age                                   brand             fuel       
##  Min.   :18.00   Fiat                              : 16723   Diesel :205945  
##  1st Qu.:34.00   Japanese (except Nissan) or Korean: 79060   Regular:207224  
##  Median :44.00   Mercedes, Chrysler or BMW         : 19280                   
##  Mean   :45.32   Opel, General Motors or Ford      : 37402                   
##  3rd Qu.:54.00   other                             :  9866                   
##  Max.   :99.00   Renault, Nissan or Citroen        :218200                   
##                  Volkswagen, Audi, Skoda or Seat   : 32638                   
##                 region          density     
##  Centre            :160601   Min.   :    2  
##  Ile-de-France     : 69791   1st Qu.:   67  
##  Bretagne          : 42122   Median :  287  
##  Pays-de-la-Loire  : 38751   Mean   : 1985  
##  Aquitaine         : 31329   3rd Qu.: 1410  
##  Nord-Pas-de-Calais: 27285   Max.   :27000  
##  (Other)           : 43290

The categorical variables here are listed as factors so the first thing I will do is convert them to character strings. Factors can have some strange ‘gotchas’ in how they are used, so it is safe to switch them to character variables at the very start.

NB: I will reverse the previous sentiment and leave these variables as factors for now.

### We use data.table ':=' syntax for this as it is fast and easy to
### understand in this case.
###
### For future data manipulation we will use dplyr for its readability.

#policy1_dt[, power  := as.character(power)]
#policy1_dt[, brand  := as.character(brand)]
#policy1_dt[, fuel   := as.character(fuel)]
#policy1_dt[, region := as.character(region)]

We now create separate vectors for the numerical and categorical variables so we can automatically generate different exploratory plots of the data.

### Numeric variables used to drive the automated univariate plots.
vars_num <- c('claim_count','exposure','car_age','driver_age','density')

### Categorical variables used to drive the automated univariate plots.
vars_cat <- c('power','brand','fuel','region')

2.1 Univariate Data Exploration

We create simple univariate exploratory plots.

2.1.1 Numeric Variables

We iterate through the numeric variables, looking at a density plot for each one.

### Produce one density plot per numeric variable, echoing the variable
### name above each plot.
for(var_label in vars_num) {
    cat(var_label, "\n", sep = "")

    density_plot <- ggplot() +
        geom_density(aes(x = policy1_dt[[var_label]])) +
        xlab(var_label) +
        ggtitle(paste0("Density Plot for Variable: ", var_label))

    plot(density_plot)
}
## claim_count

## exposure

## car_age

## driver_age

## density

None of these plots seem very useful, so we try the same thing but now use histograms.

### Produce one histogram per numeric variable, echoing the variable
### name above each plot.
for(plot_var in vars_num) {
    cat(paste0(plot_var, "\n"))

    explore_plot <- ggplot() +
        geom_histogram(aes(x = policy1_dt[[plot_var]]), bins = 30) +
        xlab(plot_var) +
        ### Fixed title: this is a histogram, not a bar plot.
        ggtitle(paste0("Histogram for Variable: ", plot_var))

    plot(explore_plot)
}
## claim_count

## exposure

## car_age

## driver_age

## density

2.1.2 Categorical Variables

We now iterate through each of the categorical variables by looking at barplots of the counts of the values.

### Produce one count barplot per categorical variable; x-axis labels
### are rotated as some levels have long names.
for(cat_var in vars_cat) {
    cat(cat_var, "\n", sep = "")

    count_plot <- ggplot() +
        geom_bar(aes(x = policy1_dt[[cat_var]])) +
        xlab(cat_var) +
        ggtitle(paste0("Barplot of Counts for Variable: ", cat_var)) +
        theme(axis.text.x = element_text(angle = 90, vjust = 0.5))

    plot(count_plot)
}
## power

## brand

## fuel

## region

2.1.2.1 exposure

The exposure variable is a bit strange - it seems strange to have policies whose duration is longer than a year in this book - but without knowledge of the business it is hard to tell. Let us see how the exposures that are longer than 1 year are distributed.

### Density of exposures longer than one year, piped straight into
### ggplot rather than nested in the call.
policy1_dt %>%
    filter(exposure > 1) %>%
    ggplot() +
    geom_density(aes(x = exposure))

We could remove these policies, but I am inclined to leave them in for the moment at least. We may need to revisit this decision later.

2.2 Bivariate Data Exploration

We first see how a pairs plot looks. The size of the dataset makes this computationally onerous, so we sample 10,000 data points and create the pairs plot for those.

### Sample size for the pairs plot - the full dataset is too slow.
pairsplot_count <- 10000

### policy_id is an identifier, so drop it before sampling.
pairsdata_dt <- policy1_dt %>%
    select(-policy_id) %>%
    sample_n(pairsplot_count, replace = FALSE)

ggpairs(pairsdata_dt)

2.2.1 density vs region

Density seems a bit strange, so I want to see how density distributes across the regions as that also seems to be geographic.

First we look at boxplots:

### Boxplot of population density within each region.
ggplot(policy1_dt) +
    geom_boxplot(aes(x = region, y = density))

Then we do a facetted histogram, facetting by region.

### Free scales: density ranges differ hugely across regions.
ggplot(policy1_dt) +
    geom_histogram(aes(x = density), bins = 50) +
    facet_wrap(~region, scales = 'free', ncol = 2) +
    ggtitle("density Histogram by region")

2.2.2 claim_count vs region

We will look at claim_count vs region to see if there are any geographic patterns.

### expand_limits(x = 4) keeps a common x-range across facets even
### where a region has no policies with 4 claims.
ggplot(policy1_dt) +
    geom_bar(aes(x = claim_count)) +
    expand_limits(x = 4) +
    facet_wrap(~region, scales = 'free', ncol = 2) +
    ggtitle("claim_count Barplot by region")

We want to normalise these counts so we can see how many claims we get as a proportion of the policy count in each region, so to do this we first need to calculate this:

### Policies per region, used to normalise the claim counts below.
policy_region_dt <- policy1_dt %>%
    group_by(region) %>%
    summarise(num_policies = n())

### Count and proportion of policies at each claim_count in each
### region. num_policies is constant within a group, so max() simply
### extracts that single value.
policyprop_dt <- policy1_dt %>%
    left_join(policy_region_dt, 'region') %>%
    group_by(region, claim_count) %>%
    summarise(count = n()
             ,prop  = n() / max(num_policies))

### Drop the zero-claim rows (they dominate) and compare proportions.
ggplot(policyprop_dt[claim_count > 0]) +
    geom_bar(aes(x = claim_count, y = prop), stat = 'identity') +
    expand_limits(y = 0.05) +
    facet_wrap(~region, ncol = 2) +
    coord_flip() +
    ggtitle("Claim Proportion Barplot by region")

We will also facet across the claim count so we can better compare the values.

### Same data, facetted by claim_count instead; expand_limits ensures
### every region appears on each facet's axis.
ggplot(policyprop_dt[claim_count > 0]) +
    geom_bar(aes(x = region, y = prop), stat = 'identity') +
    expand_limits(x = unique(policyprop_dt$region)) +
    facet_wrap(~claim_count, ncol = 2, scales = 'free') +
    ggtitle("Claim Proportion Barplot by claim_count and region")

2.2.3 region vs car_age

We want to see a distribution of car_age by region in the data:

### Distribution of car_age within each region.
ggplot(policy1_dt) +
    geom_boxplot(aes(x = region, y = car_age)) +
    ggtitle("Boxplot of car_age by region")

We may need to filter out cars that are exceptionally old.

2.3 Claim Data

First we look at a histogram of the individual claims without aggregating them by policy.

### Raw histogram of individual claim amounts (heavily right-skewed).
ggplot(claims1_dt) +
    geom_histogram(aes(x = claim_amount), bins = 50)

This does not tell us much due to the skewed nature of the claims, so we instead look at all claims below EUR 25,000:

### Histogram restricted to claims below 25,000.
### NOTE(review): amounts are in EUR but scales::dollar prefixes '$' -
### consider a custom label function if currency symbols matter.
ggplot(claims1_dt[claim_amount < 25000]) +
    geom_histogram(aes(x = claim_amount), bins = 50) +
    scale_x_continuous(labels = scales::dollar)

Claims above 25,000 are so skewed that we look at these on a separate plot with a logscale on the x-axis.

### Large claims (>= 25,000) shown on a log10 x-axis.
ggplot(claims1_dt[claim_amount >= 25000]) +
    geom_histogram(aes(x = claim_amount), bins = 50) +
    scale_x_log10(labels = scales::dollar)

To get a sense of the skew in terms of the right tail, we look at a cumulative density plot of the claim amounts:

### Empirical-CDF-style plot: sorted claim amounts (log scale) against
### cumulative probability rank.
ggplot(claims1_dt) +
    geom_line(aes(x = seq_along(claim_amount) / length(claim_amount)
                 ,y = sort(claim_amount))) +
    scale_y_log10(labels = scales::dollar) +
    xlab("Cumulative Probability") +
    ylab("Claim Amount")

2.3.1 Aggregate Claims by Policy

We now add up all the claims on a single policy and treat them as a single amount.

### Aggregate claim count and total amount per policy, largest first.
claims_amount_dt <- claims1_dt %>%
    group_by(policy_id) %>%
    summarise(num_claim    = n()
             ,total_claims = sum(claim_amount)) %>%
    arrange(desc(total_claims), desc(num_claim))

### Join totals onto the policies; policies with no claims get a
### total_claims of 0 instead of NA.
policyclaim_dt <- policy1_dt %>%
    left_join(claims_amount_dt, by = 'policy_id') %>%
    mutate(total_claims = coalesce(total_claims, 0))

Now we look at the total claims per policy.

### Histogram of total claims per policy on a log10 x-axis.
ggplot(claims_amount_dt) +
    geom_histogram(aes(x = total_claims), bins = 50) +
    scale_x_log10(labels = scales::dollar)

We first check that the merge worked properly by ensuring that claim_count and num_claim are the same.

### Sanity check on the join: no row should disagree on claim counts.
### NB: policies with no claims have num_claim == NA, so the !=
### comparison is NA there and filter() drops those rows - this check
### only validates policies that had at least one claim.
policyclaim_dt %>%
    filter(claim_count != num_claim) %>%
    print
## Source: local data table [0 x 12]
## 
## # A tibble: 0 x 12
## # ... with 12 variables: policy_id <fct>, claim_count <int>, exposure <dbl>, power <fct>,
## #   car_age <int>, driver_age <int>, brand <fct>, fuel <fct>, region <fct>,
## #   density <int>, num_claim <int>, total_claims <dbl>

We look at the cumulative claims per policy.

### Cumulative plot of per-policy totals, mirroring the earlier
### per-claim version.
ggplot(claims_amount_dt) +
    geom_line(aes(x = seq_along(total_claims) / length(total_claims)
                 ,y = sort(total_claims))) +
    scale_y_log10(labels = scales::dollar) +
    xlab("Cumulative Probability") +
    ylab("Claim Amount")

2.3.2 claimtotal by region

We do a boxplot of the total claims by region. We first will plot with all the claims to see if there is a regional pattern in the larger claims as we expect these amounts will dominate any visuals.

### All positive claim totals by region, log scale on the y-axis.
ggplot(policyclaim_dt[total_claims > 0]) +
    geom_boxplot(aes(x = region, y = total_claims)) +
    scale_y_log10(labels = scales::dollar) +
    ggtitle("Boxplot of Total Claims on a Policy by region")

We now filter out the larger claims and do a boxplot for claims between 0 and 25,000.

### Same boxplot restricted to totals between 0 and 25,000 so the
### bulk of the distribution is visible.
ggplot(policyclaim_dt[total_claims > 0 & total_claims < 25000]) +
    geom_boxplot(aes(x = region, y = total_claims)) +
    scale_y_log10(labels = scales::comma) +
    ggtitle("Boxplot of Total Claims on a Policy by region")

2.3.3 Power-law Scaling

We look at the log-log plot of claim size against the cumulative number of claims of at least the size to investigate if the claim frequency obeys a power law.

### Thresholds for the claim size on a log10 scale: 10^0 to 10^7.
logclaimsize_seq <- seq(0, 7, by = 0.1)

### Count of claims exceeding 10^logsize at each threshold.
### vapply() replaces sapply() as it guarantees an integer result
### regardless of input length.
powerlaw_dt <- data.table(
    logsize = logclaimsize_seq
   ,count   = vapply(logclaimsize_seq
                    ,function(iter_m) nrow(claims1_dt[claim_amount > 10^iter_m])
                    ,integer(1))
)

### A straight line on this log-log plot indicates power-law scaling.
ggplot(powerlaw_dt) +
    geom_line(aes(x = logsize, y = log(count))) +
    xlab("Log of Cumulative Claim Size") +
    ylab("Log of Count")

For claims above 1,000 (\(\log \text{Claim} = 3\)) a straight line could do a good job of fitting the curve, so we look at that.

### Linear fit on the tail (logsize >= 3). Thresholds with zero claims
### give log(0) = -Inf, hence the non-finite-values warning below.
ggplot(powerlaw_dt[logsize >= 3]) +
    geom_line(aes(x = logsize, y = log(count))) +
    geom_smooth(aes(x = logsize, y = log(count)), method = 'lm', se = TRUE) +
    xlab("Log of Cumulative Claim Size") +
    ylab("Log of Count")
## Warning: Removed 7 rows containing non-finite values (stat_smooth).

Encouraged by the above plots, we will model part of the claim distribution with a power law - probably to work on the likelihood of larger claims.

2.4 Univariate Plots Facetted by Claim

Now we split the data into two groups: those policies with no claims and those with at least one claim. We then create some univariate plots of the input data and facet on the claim/noclaim variable to get an idea of any differences between the two groups.

### Flag each policy: TRUE if it had at least one claim.
claim_noclaim_dt <- policyclaim_dt %>%
    mutate(claim = claim_count > 0)

Now that we have this data, we do the same thing as before, create the univariate plots of the categorical and numeric variables, and we facet on whether or not the policies have had a claim. This allows us to make direct comparisons across the variables.

As before, we start with the numeric variables first:

### Claim-facetted histograms for each numeric variable.
for(plot_var in vars_num) {
    cat(paste0(plot_var, "\n"))

    ### The deprecated underscore verbs select_()/mutate_() are
    ### replaced with the .data pronoun for string-based column access.
    plotdata_dt <- claim_noclaim_dt %>%
        transmute(use_var = .data[[plot_var]], claim = claim)

    explore_plot <- ggplot(plotdata_dt) +
        geom_histogram(aes(x = use_var), bins = 30) +
        facet_wrap(~claim, scales = 'free_y', nrow = 2) +
        scale_y_continuous(labels = scales::comma) +
        xlab(plot_var) +
        ggtitle(paste0("Claim-facetted Histograms for Variable: ", plot_var))

    plot(explore_plot)
}
## claim_count

## exposure

## car_age

## driver_age

## density

Apart from the obvious distinction between claim counts, there appears to be very little difference across the two groups, so we take a look at categorical variables.

### Claim-facetted count barplots for each categorical variable.
for(plot_var in vars_cat) {
    cat(paste0(plot_var, "\n"))

    ### The deprecated underscore verbs select_()/mutate_() are
    ### replaced with the .data pronoun for string-based column access.
    plotdata_dt <- claim_noclaim_dt %>%
        transmute(use_var = .data[[plot_var]], claim = claim)

    explore_plot <- ggplot(plotdata_dt) +
        geom_bar(aes(x = use_var)) +
        facet_wrap(~claim, scales = 'free_y', nrow = 2) +
        scale_y_continuous(labels = scales::comma) +
        xlab(plot_var) +
        ggtitle(paste0("Claim-facetted Barplots of Counts for Variable: ", plot_var)) +
        theme(axis.text.x = element_text(angle = 90, vjust = 0.5))

    plot(explore_plot)
}
## power

## brand

## fuel

## region

3 Data Cleaning

We now turn our attention to data cleaning and feature creation in the data. We do not have any premium information for the policy data, and may wish to convert some of the features from continuous to categorical - especially for variables such as age where we expect a non-linear influence on the output.

We may exclude data from the analysis if they are outliers.

One issue with removing outliers at this stage is that we are not entirely sure what counts as an outlier. It may be safer for the moment to leave them in and perhaps filter them out just prior to modelling when we have a better sense of what to do. We may wish to keep all the data for moment and split the modelling tasks into different parts, capturing different aspects of the data in different ways.

For the moment, we will leave the data intact.

4 Feature Creation

We turn our attention to adding new variables to our dataset to assist with the modelling. Before we do this, we should look at a summary of the data.

### Re-summarise after the claims join before adding features.
summary(policyclaim_dt)
##    policy_id       claim_count         exposure            power          car_age       
##  1      :     1   Min.   :0.00000   Min.   :0.002732   f      :95718   Min.   :  0.000  
##  2      :     1   1st Qu.:0.00000   1st Qu.:0.200000   g      :91198   1st Qu.:  3.000  
##  3      :     1   Median :0.00000   Median :0.540000   e      :77022   Median :  7.000  
##  4      :     1   Mean   :0.03916   Mean   :0.561088   d      :68014   Mean   :  7.532  
##  5      :     1   3rd Qu.:0.00000   3rd Qu.:1.000000   h      :26698   3rd Qu.: 12.000  
##  6      :     1   Max.   :4.00000   Max.   :1.990000   j      :18038   Max.   :100.000  
##  (Other):413163                                        (Other):36481                    
##    driver_age                                   brand             fuel       
##  Min.   :18.00   Fiat                              : 16723   Diesel :205945  
##  1st Qu.:34.00   Japanese (except Nissan) or Korean: 79060   Regular:207224  
##  Median :44.00   Mercedes, Chrysler or BMW         : 19280                   
##  Mean   :45.32   Opel, General Motors or Ford      : 37402                   
##  3rd Qu.:54.00   other                             :  9866                   
##  Max.   :99.00   Renault, Nissan or Citroen        :218200                   
##                  Volkswagen, Audi, Skoda or Seat   : 32638                   
##                 region          density        num_claim       total_claims      
##  Centre            :160601   Min.   :    2   Min.   :1.0      Min.   :      0.0  
##  Ile-de-France     : 69791   1st Qu.:   67   1st Qu.:1.0      1st Qu.:      0.0  
##  Bretagne          : 42122   Median :  287   Median :1.0      Median :      0.0  
##  Pays-de-la-Loire  : 38751   Mean   : 1985   Mean   :1.1      Mean   :     83.4  
##  Aquitaine         : 31329   3rd Qu.: 1410   3rd Qu.:1.0      3rd Qu.:      0.0  
##  Nord-Pas-de-Calais: 27285   Max.   :27000   Max.   :4.0      Max.   :2036833.0  
##  (Other)           : 43290                   NA's   :397779

From our initial data exploration in the previous document, we have a few manipulations that may be worthwhile. We will bin some of the numeric variables, and we might combine a number of levels in some categorical variables to reduce the amount of work required.

4.1 Binning Continuous Variables

We aggregate a few of the continuous features that are unlikely to have any kind of linear response in terms of the data: driver_age, car_age and density.

We have picked a somewhat arbitrary set of cutoffs to discretise the variables for these three variables and will check their use in the models we build.

### Bin driver_age, car_age and density at hand-picked cutoffs and
### store the bins as character labels in a single mutate() step.
### include.lowest = TRUE keeps the 0-valued rows in the first bin.
policyclaim_dt <- policyclaim_dt %>%
    mutate(cat_driver_age = as.character(
               cut(driver_age, c(17,22,26,42,74,Inf)))
          ,cat_car_age    = as.character(
               cut(car_age, c(0,1,4,15,Inf), include.lowest = TRUE))
          ,cat_density    = as.character(
               cut(density, c(0,40,200,500,4500,Inf)
                  ,include.lowest = TRUE)))

glimpse(policyclaim_dt)
## Observations: 413,169
## Variables: 15
## $ policy_id      <fct> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,...
## $ claim_count    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ exposure       <dbl> 0.09, 0.84, 0.52, 0.45, 0.15, 0.75, 0.81, 0.05, 0.76, 0.34, 0....
## $ power          <fct> g, g, f, f, g, g, d, d, d, i, f, f, e, e, e, e, e, e, i, i, h,...
## $ car_age        <int> 0, 0, 2, 2, 0, 0, 1, 0, 9, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ driver_age     <int> 46, 46, 38, 38, 41, 41, 27, 27, 23, 44, 32, 32, 33, 33, 33, 54...
## $ brand          <fct> Japanese (except Nissan) or Korean, Japanese (except Nissan) o...
## $ fuel           <fct> Diesel, Diesel, Regular, Regular, Diesel, Diesel, Regular, Reg...
## $ region         <fct> Aquitaine, Aquitaine, Nord-Pas-de-Calais, Nord-Pas-de-Calais, ...
## $ density        <int> 76, 76, 3003, 3003, 60, 60, 695, 695, 7887, 27000, 23, 23, 174...
## $ num_claim      <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ total_claims   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ cat_driver_age <chr> "(42,74]", "(42,74]", "(26,42]", "(26,42]", "(26,42]", "(26,42...
## $ cat_car_age    <chr> "[0,1]", "[0,1]", "(1,4]", "(1,4]", "[0,1]", "[0,1]", "[0,1]",...
## $ cat_density    <chr> "(40,200]", "(40,200]", "(500,4.5e+03]", "(500,4.5e+03]", "(40...

We will attempt to build models using both continuous and binned versions of this data and compare the performance of them all.

4.2 Aggregating Categorical Levels

A number of our categorical variables have long tails: they have a reasonable number of values with small counts. This can cause an issue as parameter estimates for these levels may lack robustness and uncertainty limits are likely to be wide. To help with this, we often create a ‘catch-all’ value and aggregate all levels below a certain count to be this ‘catch-all’ value.

4.2.1 power

### Count barplot for power, to inspect the long tail of rare levels.
ggplot(policyclaim_dt) +
    geom_bar(aes(x = power)) +
    xlab('Value') +
    ggtitle("Barplot of Counts for Variable power")

There is quite a long tail for the higher letters, so we redo this plot showing the total count of the policies as we account for additional power levels. This should give us a sense for the point at which we agglomerate the levels into a single value.

### Cumulative policy count as power levels are added, largest level
### first. transmute() replaces the multi-row summarise(), which is
### not supported usage for window expressions like cumsum().
plot_dt <- policyclaim_dt %>%
    group_by(power) %>%
    summarise(count = n()) %>%
    arrange(desc(count)) %>%
    transmute(power, cumlcount = cumsum(count))

### Lock the factor order to descending count for plotting.
plot_dt$power <- factor(plot_dt$power, levels = plot_dt$power)

ggplot(plot_dt) +
    geom_bar(aes(x = power, y = cumlcount), stat = 'identity')

Looking at this plot, we see that the levels from \(i\) on in the plot can be aggregated. We combine \((i,k,l,m,o,n)\) into a single level other.

### Levels to lump together into a single 'other' value.
power_other <- c('i','k','l','m','o','n')

### ifelse() on a factor returns the underlying integer codes rather
### than the labels, so convert power to character explicitly.
policyclaim_dt <- policyclaim_dt %>%
    mutate(agg_power = ifelse(power %in% power_other
                             ,'other'
                             ,as.character(power)))

ggplot(policyclaim_dt) +
    geom_bar(aes(x = agg_power)) +
    ggtitle("Barplot of new variable: agg_power")

We may have been slightly too aggressive with this, so just in case, we create a new variable agg_power_2 where we keep value \(i\) separate and aggregate the others.

### A less aggressive lumping: keep level 'i' separate this time.
power_other <- c('k','l','m','o','n')

### ifelse() on a factor returns the underlying integer codes rather
### than the labels, so convert power to character explicitly.
policyclaim_dt <- policyclaim_dt %>%
    mutate(agg_power_2 = ifelse(power %in% power_other
                               ,'other'
                               ,as.character(power)))

ggplot(policyclaim_dt) +
    geom_bar(aes(x = agg_power_2)) +
    ggtitle("Barplot of new variable: agg_power_2")

agg_power_2 would appear to be a better aggregation of levels in terms of balanced counts - though it should be said that this may not be in any way good or desirable.

4.2.2 region

The region variable is imbalanced, so we give it similar treatment.

### Cumulative policy count as region levels are added, largest level
### first. transmute() replaces the multi-row summarise(), which is
### not supported usage for window expressions like cumsum().
plot_dt <- policyclaim_dt %>%
    group_by(region) %>%
    summarise(count = n()) %>%
    arrange(desc(count)) %>%
    transmute(region, cumlcount = cumsum(count))

### Lock the factor order to descending count for plotting.
plot_dt$region <- factor(plot_dt$region, levels = plot_dt$region)

ggplot(plot_dt) +
    geom_bar(aes(x = region, y = cumlcount), stat = 'identity')

We will try to aggregate up the last three values: \(R25\), \(R23\) and \(R74\):

### Region levels to lump into 'other'.
### NOTE(review): the region values printed earlier in this document
### are full names (e.g. 'Aquitaine'), so these 'R25'-style codes may
### never match and agg_region would equal region everywhere - confirm
### the actual factor levels before relying on this variable.
region_other <- c('R25','R23','R74')

### ifelse() on a factor returns the underlying integer codes rather
### than the labels, so convert region to character explicitly.
policyclaim_dt <- policyclaim_dt %>%
    mutate(agg_region = ifelse(region %in% region_other
                              ,'other'
                              ,as.character(region)))

ggplot(policyclaim_dt) +
    geom_bar(aes(x = agg_region)) +
    ggtitle("Barplot of new variable: agg_region")

While not balanced, agg_region has much less of a tail. It will be interesting to see if this aggregation has any effect on model performance.

5 Write to Disk

We have done some chopping and munging with this data, and we wish to preserve some of this work across the documents so we save them to disk in both CSV and feather format.

### We first drop variable num_claim as it repeats claim_count
policyclaim_dt <- policyclaim_dt %>% select(-num_claim)


### Persist the datasets in CSV, RDS and feather formats.
### NOTE(review): the 'path' argument matches readr 1.3 used here but
### was renamed to 'file' in later readr releases - update if the
### package version changes.
write_csv(policy1_dt,         path = 'data/policy_data.csv')
write_csv(claims1_dt,         path = 'data/claim_data.csv')
write_csv(policyclaim_dt,     path = 'data/policyclaim_data.csv')

write_rds(policy1_dt,         path = 'data/policy_dt.rds')
write_rds(claims1_dt,         path = 'data/claim_dt.rds')
write_rds(policyclaim_dt,     path = 'data/policyclaim_dt.rds')

write_feather(policy1_dt,     path = 'data/policy_data.feather')
write_feather(claims1_dt,     path = 'data/claim_data.feather')
write_feather(policyclaim_dt, path = 'data/policyclaim_data.feather')

6 R Environment

### Record the package versions used, for reproducibility.
sessioninfo::session_info()
## ─ Session info ─────────────────────────────────────────────────────────────────────────
##  setting  value                       
##  version  R version 3.5.1 (2018-07-02)
##  os       Debian GNU/Linux 9 (stretch)
##  system   x86_64, linux-gnu           
##  ui       X11                         
##  language (EN)                        
##  collate  en_US.UTF-8                 
##  ctype    en_US.UTF-8                 
##  tz       Etc/UTC                     
##  date     2020-03-27                  
## 
## ─ Packages ─────────────────────────────────────────────────────────────────────────────
##  package      * version date       lib source        
##  assertthat     0.2.0   2017-04-11 [1] CRAN (R 3.5.1)
##  backports      1.1.3   2018-12-14 [1] CRAN (R 3.5.1)
##  bindr          0.1.1   2018-03-13 [1] CRAN (R 3.5.1)
##  bindrcpp       0.2.2   2018-03-29 [1] CRAN (R 3.5.1)
##  broom          0.5.1   2018-12-05 [1] CRAN (R 3.5.1)
##  CASdatasets  * 1.0-10  2020-03-26 [1] local         
##  cellranger     1.1.0   2016-07-27 [1] CRAN (R 3.5.1)
##  cli            1.0.1   2018-09-25 [1] CRAN (R 3.5.1)
##  codetools      0.2-15  2016-10-05 [2] CRAN (R 3.5.1)
##  colorspace     1.3-2   2016-12-14 [1] CRAN (R 3.5.1)
##  crayon         1.3.4   2017-09-16 [1] CRAN (R 3.5.1)
##  data.table   * 1.11.8  2018-09-30 [1] CRAN (R 3.5.1)
##  digest         0.6.18  2018-10-10 [1] CRAN (R 3.5.1)
##  dplyr        * 0.7.8   2018-11-10 [1] CRAN (R 3.5.1)
##  dtplyr       * 0.0.2   2017-04-21 [1] CRAN (R 3.5.1)
##  evaluate       0.12    2018-10-09 [1] CRAN (R 3.5.1)
##  fansi          0.4.0   2018-10-05 [1] CRAN (R 3.5.1)
##  feather      * 0.3.1   2016-11-09 [1] CRAN (R 3.5.1)
##  forcats      * 0.3.0   2018-02-19 [1] CRAN (R 3.5.1)
##  generics       0.0.2   2018-11-29 [1] CRAN (R 3.5.1)
##  GGally       * 1.4.0   2018-05-17 [1] CRAN (R 3.5.1)
##  ggplot2      * 3.1.0   2018-10-25 [1] CRAN (R 3.5.1)
##  glue           1.3.0   2018-07-17 [1] CRAN (R 3.5.1)
##  gtable         0.2.0   2016-02-26 [1] CRAN (R 3.5.1)
##  haven          2.0.0   2018-11-22 [1] CRAN (R 3.5.1)
##  hms            0.4.2   2018-03-10 [1] CRAN (R 3.5.1)
##  htmltools      0.3.6   2017-04-28 [1] CRAN (R 3.5.1)
##  httr           1.4.0   2018-12-11 [1] CRAN (R 3.5.1)
##  jsonlite       1.6     2018-12-07 [1] CRAN (R 3.5.1)
##  knitr          1.21    2018-12-10 [1] CRAN (R 3.5.1)
##  labeling       0.3     2014-08-23 [1] CRAN (R 3.5.1)
##  lattice        0.20-35 2017-03-25 [2] CRAN (R 3.5.1)
##  lazyeval       0.2.1   2017-10-29 [1] CRAN (R 3.5.1)
##  lubridate      1.7.4   2018-04-11 [1] CRAN (R 3.5.1)
##  magrittr       1.5     2014-11-22 [1] CRAN (R 3.5.1)
##  modelr         0.1.2   2018-05-11 [1] CRAN (R 3.5.1)
##  munsell        0.5.0   2018-06-12 [1] CRAN (R 3.5.1)
##  nlme           3.1-137 2018-04-07 [2] CRAN (R 3.5.1)
##  pillar         1.3.1   2018-12-15 [1] CRAN (R 3.5.1)
##  pkgconfig      2.0.2   2018-08-16 [1] CRAN (R 3.5.1)
##  plyr           1.8.4   2016-06-08 [1] CRAN (R 3.5.1)
##  purrr        * 0.2.5   2018-05-29 [1] CRAN (R 3.5.1)
##  R6             2.3.0   2018-10-04 [1] CRAN (R 3.5.1)
##  RColorBrewer   1.1-2   2014-12-07 [1] CRAN (R 3.5.1)
##  Rcpp           1.0.0   2018-11-07 [1] CRAN (R 3.5.1)
##  readr        * 1.3.0   2018-12-11 [1] CRAN (R 3.5.1)
##  readxl         1.2.0   2018-12-19 [1] CRAN (R 3.5.1)
##  reshape        0.8.8   2018-10-23 [1] CRAN (R 3.5.1)
##  reshape2       1.4.3   2017-12-11 [1] CRAN (R 3.5.1)
##  rlang          0.3.0.1 2018-10-25 [1] CRAN (R 3.5.1)
##  rmarkdown      1.11    2018-12-08 [1] CRAN (R 3.5.1)
##  rstudioapi     0.8     2018-10-02 [1] CRAN (R 3.5.1)
##  rvest          0.3.2   2016-06-17 [1] CRAN (R 3.5.1)
##  scales         1.0.0   2018-08-09 [1] CRAN (R 3.5.1)
##  sessioninfo    1.1.1   2018-11-05 [1] CRAN (R 3.5.1)
##  sp           * 1.3-1   2018-06-05 [1] CRAN (R 3.5.1)
##  stringi        1.2.4   2018-07-20 [1] CRAN (R 3.5.1)
##  stringr      * 1.3.1   2018-05-10 [1] CRAN (R 3.5.1)
##  tibble       * 1.4.2   2018-01-22 [1] CRAN (R 3.5.1)
##  tidyr        * 0.8.2   2018-10-28 [1] CRAN (R 3.5.1)
##  tidyselect     0.2.5   2018-10-11 [1] CRAN (R 3.5.1)
##  tidyverse    * 1.2.1   2017-11-14 [1] CRAN (R 3.5.1)
##  utf8           1.1.4   2018-05-24 [1] CRAN (R 3.5.1)
##  withr          2.1.2   2018-03-15 [1] CRAN (R 3.5.1)
##  xfun           0.4     2018-10-23 [1] CRAN (R 3.5.1)
##  xml2           1.2.0   2018-01-24 [1] CRAN (R 3.5.1)
##  xts          * 0.11-2  2018-11-05 [1] CRAN (R 3.5.1)
##  yaml           2.2.0   2018-07-25 [1] CRAN (R 3.5.1)
##  zoo          * 1.8-4   2018-09-19 [1] CRAN (R 3.5.1)
## 
## [1] /usr/local/lib/R/site-library
## [2] /usr/local/lib/R/library